## ── Attaching packages ────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
We must filter our BLASTn quantification results by BGC coverage criteria (>=50).
#load in quantifier data
quantifier_df <- read_delim("combined-MetaHIT_Gut-spanish-quantifier-results.txt", col_names = F, delim = "\t")
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_double(),
## X3 = col_double(),
## X4 = col_character()
## )
names(quantifier_df) <- c("bgcName", "Coverage", "RPKM", "Sample") # add column names
#Filter data by coverage
filter_quantifier_df <- quantifier_df %>% filter(Coverage >= 50)
sample_metadata <- read_csv("sample_metadata.csv", col_names = T) # load metadata
## Parsed with column specification:
## cols(
## Sample = col_character(),
## GroupAttribute = col_character()
## )
#add sample metadata to filtered quantifier dataframe
filter_quantifier_df <- filter_quantifier_df %>% inner_join(., sample_metadata, by = "Sample")
# retrieve BGC class from bgcName
results_df <- filter_quantifier_df %>% separate(., bgcName,c("sampleID", "scaffoldName", "clusterlen", "bgcClass", "software", "Coord"),sep = "__", remove = F) %>% select(-c(2:4,6:7))
############################
#Group BGC Class for figure
recode_bgcClass<-function(df){
PKS <- c("t2pks","transatpks","t1pks","lantipeptide-t2pks","otherks","otherks-transatpks")
NRPS <- c("nrps","nrps-bacteriocin","nrps-ladderane","ladderane-nrps","nrps-lantipeptide","arylpolyene-nrps")
Terpene <- "terpene"
hybrid_pks_nrps<- c("nrps-t1pks","transatpks-otherks-nrps")
Others <- c("arylpolyene","other", "resorcinol","butyrolactone", "ladderane", "siderophore", "hserlactone", "bacteriocin-arylpolyene","amglyccycl","ladderane-arylpolyene")
RiPPs <- c("microcin","bacteriocin","lantipeptide","lassopeptide","bacteriocin-lantipeptide","sactipeptide", "thiopeptide","bacteriocin-proteusin", "glycocin","microcin-lassopeptide")
df$bgcClass[df$bgcClass %in% PKS ==TRUE] <- "PKS"
df$bgcClass[df$bgcClass %in% NRPS ==TRUE] <- "NRPS"
df$bgcClass[df$bgcClass == Terpene] <- "Terpene"
df$bgcClass[df$bgcClass %in% hybrid_pks_nrps ==TRUE] <- "Hybrid PKS-NRPS"
df$bgcClass[df$bgcClass %in% Others ==TRUE] <- "Others"
df$bgcClass[df$bgcClass %in% RiPPs ==TRUE] <- "RiPPs"
return(df)
}
spanish_filtered_results<-recode_bgcClass(results_df)
#############################
#load in uniprot species data collected from our species profiler
species_df <- read_csv("species-profiler-results/species_results-MetaHIT-spanish_UniProt.csv", col_names = T)
## Parsed with column specification:
## cols(
## BGC_NAME = col_character(),
## TAXA_NAME = col_character(),
## ACC_ID = col_character(),
## PERC_IDENT = col_double(),
## COVERAGE = col_integer(),
## ORGANISM = col_character(),
## PHYLUM = col_character(),
## Manual_PHYLUM = col_character(),
## Manual_ORGANISM = col_character()
## )
species_df[is.na(species_df)]<-"N/A"
species_df$PHYLUM <- ifelse(species_df$PHYLUM == "N/A", species_df$Manual_PHYLUM, species_df$PHYLUM)
species_df$ORGANISM <- ifelse(species_df$ORGANISM == "N/A", species_df$Manual_ORGANISM, species_df$ORGANISM)
species_df$PHYLUM[species_df$PHYLUM == "N/A" ] <-"Unassigned"
species_df_bgc_class <-species_df %>% separate(., BGC_NAME,c("sampleID", "scaffoldName", "clusterlen", "bgcClass", "software", "Coord"),sep = "__", remove = F) %>% select(-c(2:4,6:7))
species_recoded_bgc_results<-recode_bgcClass(species_df_bgc_class)
#Plot BGC counts per Phyla detected
species_recoded_bgc_results %>% group_by(PHYLUM,bgcClass) %>% count() %>% ggplot(.,aes(reorder(PHYLUM, n), n)) + geom_col(aes(fill = bgcClass)) + scale_fill_npg() + coord_flip()+ theme_bw()
#ggsave("BGC-counts-perclass_Phyla-barplot.eps", width = 7.3, height = 6.31, units = "in", device = "eps")
species_recoded_bgc_results %>% group_by(PHYLUM) %>% count() %>% ggplot(.,aes(reorder(PHYLUM, n), n)) + geom_col() + scale_fill_npg() + coord_flip()+ theme_bw() + xlab("Phyla") + ylab("Total BGCs detected") +geom_text(aes(label=n), vjust=0,hjust=-.5)
#ggsave("BGC-counts-perPhyla-barplot.eps", width =11, height = 6.31, units = "in", device = "eps")
Summary statistics for filtered data per cohort.
### calculate hit ratio for each BGC within each cohort
calculate_hitRatio <-function(data, metadata){
metadata_counts<- metadata %>% group_by(GroupAttribute) %>% count() %>% ungroup()
bgc_counts <- data %>% group_by(bgcName,GroupAttribute) %>% count() %>% ungroup()
hitRatio_df <- data.frame() # intialize results df
for (i in 1:nrow(metadata_counts)){
diseaseStatus <- metadata_counts[i,]$GroupAttribute
diseaseStatus_count <- metadata_counts[i,]$n
disease_hitratio <- bgc_counts %>% filter(GroupAttribute == diseaseStatus) %>% mutate(., hitRatio = case_when(GroupAttribute == diseaseStatus ~ (n/diseaseStatus_count)*100))
hitRatio_df<-rbind(hitRatio_df,disease_hitratio)
}
return(hitRatio_df)
}
spanish_hitratio <- calculate_hitRatio(spanish_filtered_results, sample_metadata)
################################################
#Quanitfication of the exact CB and CC clusters for MetaHIT
#rainin_df <- read_delim("combined-MetaHIT_Gut-rainin-quantifier-results.txt", delim = "\t",col_names = FALSE)%>%filter(X2 >= 50) %>% inner_join(., sample_metadata, by = c("X4"="Sample"))
#names(rainin_df)[1:4] <- c("bgcName", "Coverage", "RPKM", "Sample") # add column names
######################
#Quanitfication of the exact CB and CC clusters for MetaHIT with unique reads mapped to each clusters
rainin_df <- read_delim("smNRPS-exact-unique-reads/combined-MetaHIT_Gut-rainin-exact-unique-reads-quantifier-results.txt", delim = "\t",col_names = FALSE)%>%filter(X2 >= 50) %>% inner_join(., sample_metadata, by = c("X4"="Sample"))
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_double(),
## X3 = col_double(),
## X4 = col_character()
## )
names(rainin_df)[1:4] <- c("bgcName", "Coverage", "RPKM", "Sample") # add column names
###########################
#rainin_df <- read_delim("smNRPS-exact-unique-reads-updated/combined-MetaHIT_Gut-rainin-exact-unique-reads-updated-quantifier-results.txt", delim = "\t",col_names = FALSE)%>%filter(X2 >= 50) %>% inner_join(., sample_metadata, by = c("X4"="Sample"))
#names(rainin_df)[1:4] <- c("bgcName", "Coverage", "RPKM", "Sample") # add column names
#####################################
rainin_df_hitratio <- calculate_hitRatio(rainin_df, sample_metadata)
prepare_rainin_data <- function(df, metadata_df){
missingsamples<- metadata_df %>% filter(!Sample %in% df$Sample)
df_wide <- dcast(df, bgcName ~ Sample, value.var="RPKM")
i<-1
for (i in 1:nrow(missingsamples)){
sample_col <- missingsamples[i,]$Sample
df_wide[, sample_col]<-NA
i<- i+1
}
df_wide[is.na(df_wide)] <- 0
df_melt <- melt(df_wide) %>% mutate_if(is.factor, as.character) %>% inner_join(.,metadata_df, by = c("variable"="Sample") )
names(df_melt)[2]<-"Sample"
names(df_melt)[3]<-"RPKM"
return(df_melt)
}
rainin_df_update<- prepare_rainin_data(rainin_df, sample_metadata)
## Using bgcName as id variables
rainin_df_update %>% filter(bgcName !="smNRPS-BP") %>% group_by(GroupAttribute,bgcName) %>% mutate(ymean= mean(RPKM)) %>% mutate(ymedian= median(RPKM)) %>% ggplot(., aes(Sample,RPKM)) + geom_col()+geom_hline(aes(yintercept=ymedian), color="red", linetype="dashed")+geom_hline(aes(yintercept=ymean), color="red", linetype="solid")+theme_pubclean()+ facet_grid(bgcName~GroupAttribute, switch= "x",scales = "free_x", space = "free_x") + xlab("Disease")+ theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) + theme(plot.title = element_text(hjust = 0.5,face = "bold",size = "24"))
#ggsave("combined-smNRPS-MetaHIT-barplot.eps", width = 11, height = 5, units = "in", device = "eps")
#ggsave("combined-smNRPS-exact-unique-reads-MetaHIT-barplot.eps", width = 11, height = 5, units = "in", device = "eps")
#ggsave("combined-smNRPS-exact-unique-reads-MetaHIT-update-barplot.eps", width = 11, height = 5, units = "in", device = "eps")
rainin_df_update %>% filter(bgcName !="smNRPS-BP") %>% group_by(GroupAttribute) %>% mutate(ymean= mean(RPKM)) %>% mutate(ymedian= median(RPKM)) %>% ggplot(., aes(Sample,RPKM)) + geom_bar(stat = "identity", aes(fill=bgcName))+ theme_pubclean()+geom_hline(aes(yintercept=ymedian), color="black", linetype="dashed")+geom_hline(aes(yintercept=ymean), color="black", linetype="solid")+ facet_grid(~GroupAttribute, switch= "x",scales = "free_x", space = "free_x") + xlab("Disease")+ theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) + theme(plot.title = element_text(hjust = 0.5,face = "bold",size = "24")) + scale_fill_npg() + coord_cartesian(ylim=c(0, 20))
#ggsave("combined-smNRPS-exact-unique-reads-MetaHIT-barplot-combined-max20.eps", width = 11, height = 5, units = "in", device = "eps")
rainin_df_update %>% filter(bgcName !="smNRPS-BP") %>% group_by(GroupAttribute) %>% mutate(ymean= mean(RPKM)) %>% mutate(ymedian= median(RPKM)) %>% ggplot(., aes(Sample,RPKM)) + geom_bar(stat = "identity", aes(fill=bgcName))+ theme_pubclean()+geom_hline(aes(yintercept=ymedian), color="black", linetype="dashed")+geom_hline(aes(yintercept=ymean), color="black", linetype="solid")+ facet_grid(~GroupAttribute, switch= "x",scales = "free_x", space = "free_x") + xlab("Disease")+ theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) + theme(plot.title = element_text(hjust = 0.5,face = "bold",size = "24")) + scale_fill_npg()
#ggsave("combined-smNRPS-exact-unique-reads-MetaHIT-barplot-combined.eps", width = 11, height = 5, units = "in", device = "eps")
#######################################################################################################
bar_plot_per_cluster<- function(df, cluster){
plot_title <- paste0(cluster, "-","barplot",".eps")
df %>% filter(bgcName == cluster) %>% group_by(GroupAttribute) %>% mutate(ymean= mean(RPKM)) %>% mutate(ymedian= median(RPKM)) %>% ggplot(., aes(Sample,RPKM)) + geom_col()+geom_hline(aes(yintercept=ymedian), color="red", linetype="dashed")+geom_hline(aes(yintercept=ymean), color="red", linetype="solid")+theme_pubclean()+ facet_grid(~GroupAttribute, switch= "x",scales = "free_x", space = "free_x") + xlab("Disease")+ theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) + ggtitle(cluster) + theme(plot.title = element_text(hjust = 0.5,face = "bold",size = "24"))
#ggsave(plot_title, width = 11, height = 5, units = "in", device = "eps")
}
bar_plot_per_cluster(rainin_df_update, "smNRPS-CC" )
bar_plot_per_cluster(rainin_df_update, "smNRPS-CB" )
#bar_plot_per_cluster(rainin_df_update, "smNRPS-BP" )
################################################
#smNRPS-CB:V1.UC61.0__NODE_930_length_26002_cov_3.77986__26002__nrps__ANTISMASH__0_26002
spanish_hitratio %>% filter(bgcName == "V1.UC61.0__NODE_930_length_26002_cov_3.77986__26002__nrps__ANTISMASH__0_26002")
## # A tibble: 3 x 4
## bgcName GroupAttribute n hitRatio
## <chr> <chr> <int> <dbl>
## 1 V1.UC61.0__NODE_930_length_26002_cov_3.77… C 5 7.04
## 2 V1.UC61.0__NODE_930_length_26002_cov_3.77… CD 13 61.9
## 3 V1.UC61.0__NODE_930_length_26002_cov_3.77… UC 12 9.45
#smNRPS-CC:V1.CD18.3__NODE_78_length_114577_cov_12.9704__43190__nrps__ANTISMASH__49316_92506
spanish_hitratio %>% filter(bgcName == "V1.CD18.3__NODE_78_length_114577_cov_12.9704__43190__nrps__ANTISMASH__49316_92506")
## # A tibble: 3 x 4
## bgcName GroupAttribute n hitRatio
## <chr> <chr> <int> <dbl>
## 1 V1.CD18.3__NODE_78_length_114577_cov_12.9… C 3 4.23
## 2 V1.CD18.3__NODE_78_length_114577_cov_12.9… CD 17 81.0
## 3 V1.CD18.3__NODE_78_length_114577_cov_12.9… UC 5 3.94
suppressWarnings(describeBy(spanish_hitratio, spanish_hitratio$GroupAttribute)) # #Stats for Hit Ratio
##
## Descriptive statistics by group
## group: C
## vars n mean sd median trimmed mad min max range
## bgcName* 1 818 NaN NA NA NaN NA Inf -Inf -Inf
## GroupAttribute* 2 818 NaN NA NA NaN NA Inf -Inf -Inf
## n 3 818 15.53 17.25 8.00 12.26 8.90 1.00 70.00 69.00
## hitRatio 4 818 21.88 24.29 11.27 17.26 12.53 1.41 98.59 97.18
## skew kurtosis se
## bgcName* NA NA NA
## GroupAttribute* NA NA NA
## n 1.43 1.01 0.60
## hitRatio 1.43 1.01 0.85
## --------------------------------------------------------
## group: CD
## vars n mean sd median trimmed mad min max range
## bgcName* 1 473 NaN NA NA NaN NA Inf -Inf -Inf
## GroupAttribute* 2 473 NaN NA NA NaN NA Inf -Inf -Inf
## n 3 473 4.92 4.53 3.00 4.07 2.97 1.00 20.00 19.00
## hitRatio 4 473 23.44 21.58 14.29 19.37 14.12 4.76 95.24 90.48
## skew kurtosis se
## bgcName* NA NA NA
## GroupAttribute* NA NA NA
## n 1.43 1.42 0.21
## hitRatio 1.43 1.42 0.99
## --------------------------------------------------------
## group: UC
## vars n mean sd median trimmed mad min max
## bgcName* 1 879 NaN NA NA NaN NA Inf -Inf
## GroupAttribute* 2 879 NaN NA NA NaN NA Inf -Inf
## n 3 879 23.68 28.79 10.00 17.72 11.86 1.00 120.00
## hitRatio 4 879 18.65 22.67 7.87 13.95 9.34 0.79 94.49
## range skew kurtosis se
## bgcName* -Inf NA NA NA
## GroupAttribute* -Inf NA NA NA
## n 119.0 1.58 1.51 0.97
## hitRatio 93.7 1.58 1.51 0.76
suppressWarnings(describeBy(spanish_filtered_results, spanish_filtered_results$GroupAttribute)) # Stats for RPKM
##
## Descriptive statistics by group
## group: C
## vars n mean sd median trimmed mad min max
## bgcName* 1 12707 NaN NA NA NaN NA Inf -Inf
## bgcClass* 2 12707 NaN NA NA NaN NA Inf -Inf
## Coverage 3 12707 77.28 15.47 77.95 77.60 20.88 50.00 100.00
## RPKM 4 12707 3.34 5.72 1.49 2.12 1.48 0.06 126.82
## Sample* 5 12707 NaN NA NA NaN NA Inf -Inf
## GroupAttribute* 6 12707 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## bgcName* -Inf NA NA NA
## bgcClass* -Inf NA NA NA
## Coverage 50.00 -0.12 -1.29 0.14
## RPKM 126.76 6.15 67.98 0.05
## Sample* -Inf NA NA NA
## GroupAttribute* -Inf NA NA NA
## --------------------------------------------------------
## group: CD
## vars n mean sd median trimmed mad min max
## bgcName* 1 2328 NaN NA NA NaN NA Inf -Inf
## bgcClass* 2 2328 NaN NA NA NaN NA Inf -Inf
## Coverage 3 2328 78.61 15.67 79.92 79.18 20.53 50.01 100.00
## RPKM 4 2328 7.18 12.03 2.32 4.33 2.66 0.08 115.83
## Sample* 5 2328 NaN NA NA NaN NA Inf -Inf
## GroupAttribute* 6 2328 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## bgcName* -Inf NA NA NA
## bgcClass* -Inf NA NA NA
## Coverage 49.99 -0.21 -1.26 0.32
## RPKM 115.75 3.35 15.25 0.25
## Sample* -Inf NA NA NA
## GroupAttribute* -Inf NA NA NA
## --------------------------------------------------------
## group: UC
## vars n mean sd median trimmed mad min max
## bgcName* 1 20818 NaN NA NA NaN NA Inf -Inf
## bgcClass* 2 20818 NaN NA NA NaN NA Inf -Inf
## Coverage 3 20818 77.63 15.53 78.31 77.99 21.02 50.00 100.00
## RPKM 4 20818 3.54 7.04 1.65 2.27 1.67 0.09 436.45
## Sample* 5 20818 NaN NA NA NaN NA Inf -Inf
## GroupAttribute* 6 20818 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## bgcName* -Inf NA NA NA
## bgcClass* -Inf NA NA NA
## Coverage 50.00 -0.13 -1.30 0.11
## RPKM 436.36 22.90 1187.34 0.05
## Sample* -Inf NA NA NA
## GroupAttribute* -Inf NA NA NA
spanish_filtered_results %>% group_by(GroupAttribute) %>% count(Sample) %>% describeBy(., .$GroupAttribute)# Stats for Total BGCs in each sample
## Warning in FUN(data[x, , drop = FALSE], ...): NAs introduced by coercion
## Warning in FUN(data[x, , drop = FALSE], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(data[x, , drop = FALSE], ...): NAs introduced by coercion
## Warning in FUN(data[x, , drop = FALSE], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(data[x, , drop = FALSE], ...): NAs introduced by coercion
## Warning in FUN(data[x, , drop = FALSE], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
##
## Descriptive statistics by group
## group: C
## vars n mean sd median trimmed mad min max range
## GroupAttribute* 1 71 NaN NA NA NaN NA Inf -Inf -Inf
## Sample* 2 71 NaN NA NA NaN NA Inf -Inf -Inf
## n 3 71 178.97 71.98 186 184.68 65.23 2 315 313
## skew kurtosis se
## GroupAttribute* NA NA NA
## Sample* NA NA NA
## n -0.71 -0.02 8.54
## --------------------------------------------------------
## group: CD
## vars n mean sd median trimmed mad min max range
## GroupAttribute* 1 21 NaN NA NA NaN NA Inf -Inf -Inf
## Sample* 2 21 NaN NA NA NaN NA Inf -Inf -Inf
## n 3 21 110.86 53.26 113 110.06 54.86 12 214 202
## skew kurtosis se
## GroupAttribute* NA NA NA
## Sample* NA NA NA
## n 0.09 -0.88 11.62
## --------------------------------------------------------
## group: UC
## vars n mean sd median trimmed mad min max range
## GroupAttribute* 1 127 NaN NA NA NaN NA Inf -Inf -Inf
## Sample* 2 127 NaN NA NA NaN NA Inf -Inf -Inf
## n 3 127 163.92 72.48 179 168.23 62.27 2 297 295
## skew kurtosis se
## GroupAttribute* NA NA NA
## Sample* NA NA NA
## n -0.59 -0.59 6.43
# Try to order facets.
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
new_x <- paste(x, within, sep = sep)
stats::reorder(new_x, by, FUN = fun)
}
scale_x_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}
scale_y_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_y_discrete(labels = function(x) gsub(reg, "", x), ...)
}
########
#figure for BGC cclass total molecules detected in each samples in each cohort
spanish_filtered_results %>% group_by(GroupAttribute,bgcClass) %>% count(Sample) %>% ggplot(.,aes(reorder_within(Sample, n, GroupAttribute), n)) + geom_col(aes(fill = bgcClass)) + scale_fill_npg()+ facet_grid(~GroupAttribute, switch = "x", scales = "free_x", space = "free_x") + scale_x_reordered()+
theme(panel.spacing = unit(0, "lines"),
strip.background = element_blank()) + xlab("Disease Status")+ ylab("Total BGCs detected") + theme_bw() + theme(axis.text.x = element_blank(),axis.ticks.x = element_blank())
#########
wilcoxon_test<-function(df, cohort, testType){
df$GroupAttribute <-as.factor(df$GroupAttribute)
print(levels(df$GroupAttribute))
results<-df %>% group_by(GroupAttribute) %>% count(Sample) %>% wilcox.test(n ~ GroupAttribute, data=., paired=F, alternative = testType,subset = GroupAttribute %in% cohort )
print(results)
}
#levels = "C" "CD" "UC"
wilcoxon_test(spanish_filtered_results, c("C", "CD"), "greater") # twosided=8.806e-05/ 4.403e-05
## [1] "C" "CD" "UC"
##
## Wilcoxon rank sum test with continuity correction
##
## data: n by GroupAttribute
## W = 1167.5, p-value = 4.403e-05
## alternative hypothesis: true location shift is greater than 0
wilcoxon_test(spanish_filtered_results, c("CD", "UC"), "less") # twosided=0.0008254,0.0004127
## [1] "C" "CD" "UC"
##
## Wilcoxon rank sum test with continuity correction
##
## data: n by GroupAttribute
## W = 724.5, p-value = 0.0004127
## alternative hypothesis: true location shift is less than 0
wilcoxon_test(spanish_filtered_results, c("C", "UC"), "greater")#twosided=0.175, 0.08748
## [1] "C" "CD" "UC"
##
## Wilcoxon rank sum test with continuity correction
##
## data: n by GroupAttribute
## W = 5033.5, p-value = 0.08748
## alternative hypothesis: true location shift is greater than 0
############
#Determine the mean BGCs detected in each cohort pairs
spanish_filtered_results %>% filter(GroupAttribute %in% c("C", "CD")) %>% count(Sample) %>% inner_join(., sample_metadata) %>% group_by(GroupAttribute) %>% mutate(meanBGC = mean(n)) %>% distinct(meanBGC, GroupAttribute)
## Joining, by = "Sample"
## # A tibble: 2 x 2
## # Groups: GroupAttribute [2]
## GroupAttribute meanBGC
## <chr> <dbl>
## 1 C 179.
## 2 CD 111.
spanish_filtered_results %>% filter(GroupAttribute %in% c("C", "UC")) %>% count(Sample) %>% inner_join(., sample_metadata) %>% group_by(GroupAttribute) %>% mutate(meanBGC = mean(n)) %>% distinct(meanBGC, GroupAttribute)
## Joining, by = "Sample"
## # A tibble: 2 x 2
## # Groups: GroupAttribute [2]
## GroupAttribute meanBGC
## <chr> <dbl>
## 1 UC 164.
## 2 C 179.
spanish_filtered_results %>% filter(GroupAttribute == "C" | GroupAttribute == "CD") %>% group_by(GroupAttribute) %>% count(Sample)%>% ggplot(., aes(GroupAttribute, n, color = GroupAttribute)) + geom_boxplot(outlier.colour = NA) + geom_jitter(width = 0.2)+stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..), width = .75, linetype = "dashed")+scale_color_npg()+ theme_classic() +annotate("text", x = 1.5, y = 300, label = "Wilcoxon, p = 4.403e-05")+stat_compare_means(comparisons = list( c("C", "CD")),label = "p.signif", paired =F)
spanish_filtered_results %>% filter(GroupAttribute == "UC" | GroupAttribute == "CD") %>% group_by(GroupAttribute) %>% count(Sample)%>% ggplot(., aes(GroupAttribute, n, color = GroupAttribute)) + geom_boxplot(outlier.colour = NA) + geom_jitter(width = 0.2)+stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..), width = .75, linetype = "dashed")+scale_color_npg()+ theme_classic() +annotate("text", x = 1.5, y = 300, label = "Wilcoxon, p = 0.0004127")+stat_compare_means(comparisons = list( c("UC", "CD")),label = "p.signif", paired =F)
spanish_filtered_results %>% filter(GroupAttribute == "UC" | GroupAttribute == "C") %>% group_by(GroupAttribute) %>% count(Sample)%>% ggplot(., aes(GroupAttribute, n, color = GroupAttribute)) + geom_boxplot(outlier.colour = NA) + geom_jitter(width = 0.2)+stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..), width = .75, linetype = "dashed")+scale_color_npg()+ theme_classic() +annotate("text", x = 1.5, y = 300, label = "Wilcoxon, p = 0.08748")+stat_compare_means(comparisons = list( c("UC", "C")),label = "p.format", paired =F)
Analysis for Crohns vs Controls: iterative KW test; LDA analysis; Random forest
# Function to add BGCs that zeros when a BGC was not found in a Sample
prepare_data<-function(df, metadata_df){
data_wide <- dcast(df, bgcName ~ Sample, value.var="RPKM")
data_wide[is.na(data_wide)] <- 0
data_melt <- melt(data_wide) %>% mutate_if(is.factor, as.character) %>% inner_join(.,metadata_df, by = c("variable"="Sample") )
names(data_melt)[2]<-"Sample"
names(data_melt)[3]<-"RPKM"
return(data_melt)
}
filter_quantifier_df_updated <- prepare_data(filter_quantifier_df, sample_metadata)
## Using bgcName as id variables
quantifier_CD_C <-filter_quantifier_df_updated %>% filter(GroupAttribute == "CD" | GroupAttribute == "C")
quantifier_UC_C <-filter_quantifier_df_updated %>% filter(GroupAttribute == "UC" | GroupAttribute == "C")
quantifier_UC_CD <-filter_quantifier_df_updated %>% filter(GroupAttribute == "UC" | GroupAttribute == "CD")
################
#Rainin not exact clusters barplot
bar_plot_per_cluster(filter_quantifier_df_updated, "V1.UC61.0__NODE_930_length_26002_cov_3.77986__26002__nrps__ANTISMASH__0_26002" ) #smNRPS CB
bar_plot_per_cluster(filter_quantifier_df_updated, "V1.CD18.3__NODE_78_length_114577_cov_12.9704__43190__nrps__ANTISMASH__49316_92506" ) #smNRPS CC
###############
#Enriched CD E.coli dervived clusters
bar_plot_per_cluster(filter_quantifier_df_updated, "V1.UC61.0__NODE_228_length_65988_cov_4.67854__26289__nrps__ANTISMASH__39699_65988" )
bar_plot_per_cluster(filter_quantifier_df_updated, "V1.UC61.0__NODE_2273_length_12464_cov_2.76678__12464__t1pks__ANTISMASH__0_12464" )
bar_plot_per_cluster(filter_quantifier_df_updated,"V1.CD41.0__NODE_43_length_139509_cov_97.9643__53525__nrps__ANTISMASH__80787_134312")
bar_plot_per_cluster(filter_quantifier_df_updated,"V1.UC5.0__NODE_2999_length_8272_cov_2.30449__8272__bacteriocin__ANTISMASH__0_8272")
bar_plot_per_cluster(filter_quantifier_df_updated,"V1.CD35.1__NODE_3185_length_6846_cov_4.30231__6846__siderophore__ANTISMASH__0_6846")
bar_plot_per_cluster(filter_quantifier_df_updated,"V1.CD10.0__NODE_453_length_17115_cov_2.32239__17115__siderophore__ANTISMASH__0_17115")
bar_plot_per_cluster(filter_quantifier_df_updated,"V1.CD18.3__NODE_40_length_166683_cov_10.5209__59165__nrps-t1pks__ANTISMASH__61574_120739")
ecoli_bgcs<-c("V1.UC61.0__NODE_228_length_65988_cov_4.67854__26289__nrps__ANTISMASH__39699_65988","V1.UC61.0__NODE_2273_length_12464_cov_2.76678__12464__t1pks__ANTISMASH__0_12464","V1.CD41.0__NODE_43_length_139509_cov_97.9643__53525__nrps__ANTISMASH__80787_134312","V1.UC5.0__NODE_2999_length_8272_cov_2.30449__8272__bacteriocin__ANTISMASH__0_8272", "V1.CD35.1__NODE_3185_length_6846_cov_4.30231__6846__siderophore__ANTISMASH__0_6846", "V1.CD10.0__NODE_453_length_17115_cov_2.32239__17115__siderophore__ANTISMASH__0_17115", "V1.CD18.3__NODE_40_length_166683_cov_10.5209__59165__nrps")
ecoli_bgc_data<- filter_quantifier_df %>% filter(bgcName %in% ecoli_bgcs)
#################
#Format dataframe to run lefse
source("format_data_for_lefse.R")
quantifier_CD_C_lefse<-format_data_for_lefse(quantifier_CD_C)
quantifier_UC_C_lefse<-format_data_for_lefse(quantifier_UC_C)
quantifier_UC_CD_lefse<-format_data_for_lefse(quantifier_UC_CD)
#write_delim(quantifier_CD_C_lefse, "quantifier-results-CD_C-spanish-for-lefse.txt",delim="\t")
#write_delim(quantifier_UC_C_lefse, "quantifier-results-UC_C-spanish-for-lefse.txt",delim="\t")
#write_delim(quantifier_UC_CD_lefse, "quantifier-results-UC_CD-spanish-for-lefse.txt",delim="\t")
# Analysis with pvalue <= .01; LDA Score >2
########################################################
lefse_CD_C <- read_delim("LEfSe-results/LDA_Effect_Size_CD_C_stricter_pvalue.txt",col_names = F, delim = "\t")
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_double(),
## X3 = col_character(),
## X4 = col_double(),
## X5 = col_character()
## )
colnames(lefse_CD_C)<-c("BGC", "log(HighestMean_AllClasses)","class_discriminitive","LDA_score", "pvalue")
lefse_CD_C_discriminative_features <- lefse_CD_C %>% filter(!is.na(class_discriminitive) & !is.na(LDA_score))
lefse_CD_C_discriminative_features<-lefse_CD_C_discriminative_features %>% mutate(LDA_score = ifelse(class_discriminitive == "C", -abs(LDA_score), LDA_score))
lefse_CD_C_discriminative_features %>% mutate(BGC = reorder(BGC, LDA_score)) %>%
ggplot(aes(BGC, LDA_score, fill = class_discriminitive)) +
geom_bar(stat = "identity") +
ylab("LDA SCORE (log 10)") +
coord_flip() + theme_bw() + scale_fill_manual(values = c("#DC0000FF", "#00A087FF")) + theme(legend.position="top") +theme(legend.title=element_blank())
#ggsave("lefse_CD_C_discriminative_features.eps", width = 10, height = 15, units = "in", device = "eps")
########################################################
lefse_UC_C <- read_delim("LEfSe-results/LDA_Effect_Size_UC_C_stricter_pvalue.txt",col_names = F, delim = "\t")
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_double(),
## X3 = col_character(),
## X4 = col_double(),
## X5 = col_character()
## )
colnames(lefse_UC_C)<-c("BGC", "log(HighestMean_AllClasses)","class_discriminitive","LDA_score", "pvalue")
lefse_UC_C_discriminative_features <- lefse_UC_C %>% filter(!is.na(class_discriminitive) & !is.na(LDA_score))
lefse_UC_C_discriminative_features<-lefse_UC_C_discriminative_features %>% mutate(LDA_score = ifelse(class_discriminitive == "C", -abs(LDA_score), LDA_score))
lefse_UC_C_discriminative_features %>% mutate(BGC = reorder(BGC, LDA_score)) %>%
ggplot(aes(BGC, LDA_score, fill = class_discriminitive)) +
geom_bar(stat = "identity") +
ylab("LDA SCORE (log 10)") +
coord_flip() + theme_bw() + scale_fill_manual(values = c("#DC0000FF", "#00A087FF")) + theme(legend.position="top") +theme(legend.title=element_blank())
#ggsave("lefse_UC_C_discriminative_features.eps", width = 10, height = 15, units = "in", device = "eps")
########################################################
lefse_UC_CD <- read_delim("LEfSe-results/LDA_Effect_Size_UC_CD_stricter_pvalue.txt",col_names = F, delim = "\t")
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_double(),
## X3 = col_character(),
## X4 = col_double(),
## X5 = col_character()
## )
colnames(lefse_UC_CD)<-c("BGC", "log(HighestMean_AllClasses)","class_discriminitive","LDA_score", "pvalue")
lefse_UC_CD_discriminative_features <- lefse_UC_CD %>% filter(!is.na(class_discriminitive) & !is.na(LDA_score))
lefse_UC_CD_discriminative_features<-lefse_UC_CD_discriminative_features %>% mutate(LDA_score = ifelse(class_discriminitive == "CD", -abs(LDA_score), LDA_score))
lefse_UC_CD_discriminative_features %>% mutate(BGC = reorder(BGC, LDA_score)) %>%
ggplot(aes(BGC, LDA_score, fill = class_discriminitive)) +
geom_bar(stat = "identity") +
ylab("LDA SCORE (log 10)") +
coord_flip() + theme_bw() + scale_fill_manual(values = c("#DC0000FF", "#00A087FF")) + theme(legend.position="top") +theme(legend.title=element_blank())
#ggsave("lefse_UC_CD_discriminative_features.eps", width = 10, height = 10, units = "in", device = "eps")
#Venn diagram
#
# lefse_CD_C_discriminative_features
# lefse_UC_C_discriminative_features
# lefse_UC_CD_discriminative_features
#
# enrichedUC_C_list<-lefse_UC_C_discriminative_features%>% filter(class_discriminitive == "UC")
# enrichedUC_CD_list<-lefse_UC_CD_discriminative_features%>% filter(class_discriminitive == "UC")
# gplots::venn(list(enrichedUC_CD = enrichedUC_CD_list$BGC, enrichedUC_C= enrichedUC_C_list$BGC))
#iterative KW similarly to Lefse to retreive p-values for all BGCs and compare to Lefse results
#normalized by LEfse methods
iterativeKW<-function(df, features){
df<- df %>% group_by(Sample) %>% mutate(sumRPKM = sum(RPKM)) %>% mutate(normalizedRPKM = (RPKM/sumRPKM) * 1000000)
df$GroupAttribute <-as.factor(df$GroupAttribute)
features_pvalues<-list()
pvalues<-list()
i <-1
for (i in 1:length(features)){
featureDF<- df %>% filter(bgcName == features[i]) # current feature data
kwResult<-kruskal.test(normalizedRPKM ~ GroupAttribute, data = featureDF)
features_pvalues <- append(features_pvalues, list(features[i]))
pvalues<-append(pvalues, list(kwResult$p.value))
i<-i+1
}
resultsDF<-do.call(rbind, Map(data.frame, A=features_pvalues, B=pvalues))
names(resultsDF) <- c("features", "pvalues")
#resultsDF$BH.adjusted <- p.adjust(resultsDF$pvalues, method="BH", n = length(resultsDF$pvalues)) # adjust pvalues
return(resultsDF)
}
####################################
quantifier_CD_C_KW <- iterativeKW(quantifier_CD_C,unique(quantifier_CD_C$bgcName))
quantifier_CD_C_KW$pvalues[quantifier_CD_C_KW$pvalues == "NaN"]<- "-"
###################################
quantifier_UC_C_KW <- iterativeKW(quantifier_UC_C,unique(quantifier_UC_C$bgcName))
quantifier_UC_C_KW$pvalues[quantifier_UC_C_KW$pvalues == "NaN"]<- "-"
###################################
quantifier_UC_CD_KW <- iterativeKW(quantifier_UC_CD,unique(quantifier_UC_CD$bgcName))
quantifier_UC_CD_KW$pvalues[quantifier_UC_C_KW$pvalues == "NaN"]<- "-"
###################################
supplementary_tables<-function(df, kw_df,taxa_df){
new_df <- df %>% separate(., BGC, into=c("Sample","ScaffoldName", "bgcLen", "bgcClass", "Software","Coords" ), sep = "__")
recode_df<- new_df %>% mutate(Sample = str_replace_all(Sample, "_", ".")) %>% mutate(ScaffoldName=stringi::stri_replace_last_fixed(ScaffoldName, '_', '.')) %>% mutate(bgcClass=str_replace_all(bgcClass, '_', '-'))%>%unite(., "BGC",c("Sample","ScaffoldName", "bgcLen", "bgcClass", "Software","Coords" ), sep = "__")
recode_df[is.na(recode_df)]<-"-"
supp_table<-recode_df%>% inner_join(.,kw_df, by= c("BGC" = "features"))%>% inner_join(.,taxa_df, by = c("BGC"="BGC_NAME")) %>% select(., 1:6,8,11:12)
names(supp_table)[5]<-"LEfSe_pvalue"
names(supp_table)[6]<-"manual_pvalue"
names(supp_table)[7]<-"NCBI refrence sequence acession number"
names(supp_table)[8]<-"Taxa Name"
names(supp_table)[9]<-"Phylum"
return(supp_table)
}
lefse_CD_C_supp_table<-supplementary_tables(lefse_CD_C,quantifier_CD_C_KW,species_df)
## Warning: Column `BGC`/`features` joining character vector and factor,
## coercing into character vector
#write_csv(lefse_CD_C_supp_table, "lefse_CD_C_supp_table_1.csv", col_names = T)
lefse_UC_C_supp_table<-supplementary_tables(lefse_UC_C,quantifier_UC_C_KW,species_df)
## Warning: Column `BGC`/`features` joining character vector and factor,
## coercing into character vector
#write_csv(lefse_UC_C_supp_table, "lefse_UC_C_supp_table_2.csv", col_names = T)
lefse_UC_CD_supp_table<-supplementary_tables(lefse_UC_CD,quantifier_UC_CD_KW,species_df)
## Warning: Column `BGC`/`features` joining character vector and factor,
## coercing into character vector
#write_csv(lefse_UC_CD_supp_table, "lefse_UC_CD_supp_table_3.csv", col_names = T)